In [1]:
import pandas as pd
import numpy as np

In [7]:
# Column name -> Python type, passed as `dtype=` to every pd.read_csv call in this notebook.
dtype_dict = {
    # columns read as text
    'id': str,
    'date': str,
    'zipcode': str,
    # columns read as floats
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # columns read as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [311]:
# Read the train / validation / test files and kc_house_data_small.csv,
# all parsed with the same column types from dtype_dict.
train, validation, test, sales = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'kc_house_data_small_train.csv',
        'kc_house_data_validation.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_small.csv',
    )
)

In [312]:
def get_numpy_data(data_sframe, features, output):
    """Pull the feature columns and the output column out of a frame as arrays.

    Parameters
    ----------
    data_sframe : frame indexable by a list of column names and exposing `.values`
    features : list of column names used as model inputs
    output : name of the single output column

    Returns
    -------
    (feature_matrix, output_array)
        `feature_matrix` holds the `features` columns in the given order;
        `output_array` is selected with a one-element column list, so it stays
        two-dimensional (one column) rather than being flattened.
    """
    inputs_frame = data_sframe[features]
    target_frame = data_sframe[[output]]
    return inputs_frame.values, target_frame.values

In [313]:
def normalize_features(features):
    """Scale every column of `features` to unit Euclidean (2-) norm.

    Returns
    -------
    (normalized_features, norms)
        `norms` holds one 2-norm per column (computed along axis 0) so the same
        scaling can be applied to other matrices with the same columns.
        No guard is applied for a column whose norm is zero.
    """
    column_norms = np.linalg.norm(features, axis=0)
    scaled = features / column_norms
    return scaled, column_norms

In [314]:
# Input columns, in the order they appear as columns of the feature matrices.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

# Every column named in dtype_dict except id, date, zipcode and price.
# This is a set difference, so the resulting order is arbitrary.
my_features = list(dtype_dict.keys() - ['id', 'date', 'zipcode', 'price'])

# Same input columns and the same output column ('price') for each split.
(
    (features_train, output_train),
    (features_valid, output_valid),
    (features_test, output_test),
) = (
    get_numpy_data(split_frame, feature_list, 'price')
    for split_frame in (train, validation, test)
)

In [315]:
# Sanity check: names in my_features (derived from dtype_dict) that are absent from feature_list.
set(my_features) - set(feature_list)


Out[315]:
set()

In [316]:
# The column norms are computed on the training matrix only; the validation and
# test matrices are divided by those same norms so all three share one scale.
features_train, norms = normalize_features(features_train)
features_valid = np.divide(features_valid, norms)
features_test = np.divide(features_test, norms)

Quiz Question: What is the Euclidean distance between the query house (the first test house, `features_test[0]`) and the 10th house of the training set?


In [317]:
# Normalized feature rows: first test house, then the 10th training house (index 9).
print(features_test[0], features_train[9], sep='\n')


[ 0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01163464  0.00602491  0.0083488   0.00050756  0.01279425  0.          0.
  0.01938684  0.01390535  0.0096309   0.          0.01302544  0.
  0.01346821 -0.01346251  0.01195898  0.00156612]

In [318]:
import math

In [319]:
def get_distance(vec1, vec2):
    """Return the Euclidean distance between two vectors as a Python float."""
    delta = vec1 - vec2
    squared_length = np.sum(delta ** 2)
    return math.sqrt(squared_length)

In [320]:
# Distance between the first test row and the 10th training row (index 9), on the normalized features.
get_distance(features_test[0], features_train[9])


Out[320]:
0.05972359371398078

Quiz Question: Among the first 10 training houses, which house is the closest to the query house?


In [321]:
# Linear scan over the first ten training rows, keeping the smallest distance to
# the first test row and the index at which it occurs (first one wins on ties).
min_distance, closest_house = None, None
query_house = features_test[0]
for idx in range(len(features_train[0:10])):
    candidate = get_distance(query_house, features_train[idx])
    if min_distance is None or candidate < min_distance:
        min_distance, closest_house = candidate, idx

In [322]:
# Smallest distance found, then the index (0-9) of the training row it belongs to.
print(min_distance, closest_house, sep='\n')


0.052383627840220305
8

In [323]:
# Broadcasting subtracts the single query row (first test row) from every training row at once.
diff = features_train - features_test[0]

In [324]:
# Sum of the last row of diff, i.e. the summed feature-wise differences for the last training row.
np.sum(diff[-1], axis=0)


Out[324]:
-0.093433998746546426

In [325]:
# Row-wise Euclidean norm of diff: one distance per training row.
dist = np.sqrt(np.sum(diff**2, axis=1))

In [326]:
# Distance for the training row at index 100.
dist[100]


Out[326]:
0.023708232416678195

In [327]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from each row of `features_instances` to `features_query`.

    The query is subtracted from the instances (broadcast across rows), and the
    squared differences are summed along axis 1, so the result has one entry
    per instance row.
    """
    squared_offsets = (features_instances - features_query) ** 2
    return np.sqrt(squared_offsets.sum(axis=1))

17. Quiz Question: What is the predicted value of the query house (the third test house, `features_test[2]`) based on 1-nearest neighbor regression?


In [328]:
# Distances from every training row to the third test row, followed by the
# index of the closest training row.
distances = compute_distances(features_train, features_test[2])
print(distances, np.argmin(distances), sep='\n')


[ 0.01954476  0.06861035  0.02165079 ...,  0.02433478  0.02622734
  0.02637942]
382

In [296]:
# Cross-check of np.argmin: every index whose distance equals the minimum.
np.where(distances == min(distances))


Out[296]:
(array([382]),)

In [297]:
# Distance for the training row at index 1149.
distances[1149]


Out[297]:
0.0032258402701799303

In [298]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Find the k rows of `feature_train` closest to `features_query`.

    Parameters
    ----------
    k : int
        Number of neighbours to return.
    feature_train : 2-D array
        Candidate rows, one instance per row (distances are reduced along
        axis 1 by `compute_distances`).
    features_query : 1-D array
        The query point, with the same columns as `feature_train`.

    Returns
    -------
    (distances, neighbours)
        `distances` holds the distance from every row of `feature_train` to the
        query; `neighbours` holds the indices of the k smallest distances,
        ordered from nearest to farthest.
    """
    # Use the argument that was passed in. The previous body read the
    # notebook-level `features_train` global instead, so any other matrix
    # handed to this function was silently ignored.
    distances = compute_distances(feature_train, features_query)
    return distances, np.argsort(distances)[:k]

In [299]:
# Full distance vector plus the indices of the 4 closest training rows for the third test row.
distances, neighbours = k_nearest_neighbors(4, features_train, features_test[2])

In [300]:
# One distance per selected neighbour, in the order the indices were returned,
# followed by the index array itself.
for neighbour_idx in neighbours:
    print(distances[neighbour_idx])
print(neighbours)


0.00286049555751
0.00322584027018
0.00350215633337
0.00359315383341
[ 382 1149 4087 3142]

In [301]:
# Indices of the nearest training rows found above.
print(neighbours)


[ 382 1149 4087 3142]

In [302]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query point by k-nearest-neighbour averaging.

    The neighbour indices come from `k_nearest_neighbors`; the prediction is the
    mean of `output_train` over those indices.
    """
    _, nearest_idx = k_nearest_neighbors(k, features_train, features_query)
    return output_train[nearest_idx].mean()

In [303]:
# 1-nearest-neighbour prediction for the third test row.
predict_output_of_query(1, features_train, output_train, features_test[2])


Out[303]:
249000.0

In [304]:
# 4-nearest-neighbour prediction for the same (third) test row.
predict_output_of_query(4, features_train, output_train, features_test[2])


Out[304]:
413987.5

In [305]:
# Observed price of the third test row, for comparison with the predictions.
print(output_test[2])


[ 438000.]

In [306]:
def predict_output(k, features_train, output_train, features_query):
    """Run the k-nearest-neighbour prediction for every row of `features_query`.

    Returns
    -------
    numpy array of shape (number of query rows, 1)
        Row i holds `predict_output_of_query` evaluated on `features_query[i]`.
    """
    n_queries = features_query.shape[0]
    per_query = [
        predict_output_of_query(k, features_train, output_train, features_query[row])
        for row in range(n_queries)
    ]
    # Keep the result as a single-column 2-D array.
    predictions = np.zeros((n_queries, 1))
    predictions[:, 0] = per_query
    return predictions

In [307]:
# 10-nearest-neighbour predictions for the first ten test rows, followed by the
# position (0-9) of the lowest predicted value.
predictions = predict_output(10, features_train, output_train, features_test[:10])
print(predictions, np.argmin(predictions), sep='\n')


[[ 881300. ]
 [ 431860. ]
 [ 460595. ]
 [ 430200. ]
 [ 766750. ]
 [ 667420. ]
 [ 350032. ]
 [ 512800.7]
 [ 484000. ]
 [ 457235. ]]
6

In [308]:
# Observed prices of the first ten test rows.
print(output_test[:10])


[[ 650000.]
 [ 485000.]
 [ 438000.]
 [ 535000.]
 [ 785000.]
 [ 975000.]
 [ 287000.]
 [ 355000.]
 [ 305000.]
 [ 518500.]]

In [309]:
# Residual sum of squares on the validation split for k = 1..15.
# One value per k is appended to rsss, in increasing order of k.
rsss = []
for k in range(1, 16):
    predictions = predict_output(k, features_train, output_train, features_valid)
    error = np.subtract(predictions, output_valid)
    # Both operands are single-column arrays, so the product is a 1x1 array.
    rss = np.dot(error.T, error)
    print('RSS for k=%s: %s' % (k, rss))
    rsss.append(rss)


RSS for k=1: [[  1.05453830e+14]]
RSS for k=2: [[  8.34450735e+13]]
RSS for k=3: [[  7.26920960e+13]]
RSS for k=4: [[  7.19467217e+13]]
RSS for k=5: [[  6.98465174e+13]]
RSS for k=6: [[  6.88995444e+13]]
RSS for k=7: [[  6.83419735e+13]]
RSS for k=8: [[  6.73616787e+13]]
RSS for k=9: [[  6.83727280e+13]]
RSS for k=10: [[  6.93350487e+13]]
RSS for k=11: [[  6.95238552e+13]]
RSS for k=12: [[  6.90499696e+13]]
RSS for k=13: [[  7.00112545e+13]]
RSS for k=14: [[  7.09086989e+13]]
RSS for k=15: [[  7.11069284e+13]]

In [310]:
# Residual sum of squares on the test split with k = 3.
predictions = predict_output(3, features_train, output_train, features_test)
error = np.subtract(predictions, output_test)
# Both operands are single-column arrays, so the product is a 1x1 array.
rss = np.dot(error.T, error)
print(rss)


[[  1.49008587e+14]]

In [ ]: